# -*- coding: utf-8 -*-

import os
from docx import Document
from utils import mkdirs

# Root folder for the input and output files, given relative to the current
# working directory. The functions below list its 'docx/' and 'pdf/'
# subfolders and write extracted text into 'texts/'.
source_dir = '../static/files/'

def read_docx():
	"""Print the paragraph texts of every file in the ``docx/`` folder.

	Each entry of ``source_dir + 'docx/'`` is opened with python-docx and the
	list of its paragraph texts is printed, one list per file.

	Known limitation (noted by the original author): not all of a document's
	content is read. Only ``doc.paragraphs`` is collected here, so presumably
	text stored elsewhere (e.g. inside tables) is what goes missing --
	TODO confirm.
	"""
	docx_dir = source_dir + 'docx/'
	for name in os.listdir(docx_dir):
		# listdir() returns names relative to docx_dir, so they must be joined
		# back onto that directory (not onto source_dir) to be opened.
		doc = Document(os.path.join(docx_dir, name))
		print([p.text for p in doc.paragraphs])

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf

def _pdf_to_text(pdf_path):
	"""Return the text that pdfminer extracts from the PDF at *pdf_path*."""
	rsrcmgr = PDFResourceManager()
	with StringIO() as buf:
		device = TextConverter(rsrcmgr, buf, laparams=LAParams())
		try:
			with open(pdf_path, 'rb') as pdf_file:
				process_pdf(rsrcmgr, device, pdf_file)
		finally:
			# Release the converter even when parsing fails part-way.
			device.close()
		# Read the buffer only after the device has been closed, and before
		# the buffer itself is closed on leaving the ``with`` block.
		return buf.getvalue()


def read_pdf():
	"""Convert every file under ``pdf/`` into a cleaned text file under ``texts/``.

	For each entry of ``source_dir + 'pdf/'`` the name is printed, the text is
	extracted with pdfminer, and the result is written to
	``source_dir + 'texts/'`` under the same name minus a trailing ``.pdf``.
	Every output line is stripped, has all of its spaces removed, and is
	dropped when nothing remains.

	``mkdirs`` comes from the project's ``utils`` module; presumably it
	creates the output directory when it is missing -- TODO confirm.

	Raises:
		OSError: if a directory cannot be listed or a file cannot be opened.
	"""
	text_dir = source_dir + 'texts/'
	mkdirs(text_dir)

	pdf_dir = source_dir + 'pdf/'
	for pdf_name in os.listdir(pdf_dir):
		print(pdf_name)

		# Strip only a trailing '.pdf'; str.replace() would also remove the
		# sequence from the middle of a name.
		if pdf_name.endswith('.pdf'):
			out_name = pdf_name[:-len('.pdf')]
		else:
			out_name = pdf_name

		content = _pdf_to_text(os.path.join(pdf_dir, pdf_name))
		cleaned = (line.strip().replace(' ', '') for line in content.split('\n'))

		# Write as UTF-8 explicitly: the locale default encoding may be unable
		# to represent the extracted (possibly non-ASCII) text.
		with open(os.path.join(text_dir, out_name), 'w', encoding='utf-8') as out:
			out.writelines(line + '\n' for line in cleaned if line)

if __name__ == "__main__":
	# Script entry point: only the PDF conversion runs by default.
	# Call read_docx() here instead to dump the .docx paragraphs.
	read_pdf()
